0187. 重复的DNA序列【中等】
1. 📝 题目描述
DNA 序列 由一系列核苷酸组成,缩写为 'A','C','G' 和 'T'.。
- 例如,
"ACGAATTCCG"是一个 DNA 序列。
在研究 DNA 时,识别 DNA 中的重复序列非常有用。
给定一个表示 DNA 序列 的字符串 s,返回所有在 DNA 分子中出现不止一次的 长度为 10 的序列(子字符串)。你可以按 任意顺序 返回答案。
示例 1:
txt
输入:s = "AAAAACCCCCAAAAACCCCCCAAAAAGGGTTT"
输出:["AAAAACCCCC","CCCCCAAAAA"]1
2
2
示例 2:
txt
输入:s = "AAAAAAAAAAAAA"
输出:["AAAAAAAAAA"]1
2
2
提示:
0 <= s.length <= 10^5s[i]=='A'、'C'、'G'or'T'
2. 🎯 s.1 - 哈希集合
c
/**
* Note: The returned array must be malloced, assume caller calls free().
*/
#define HASH_SIZE 100003
typedef struct Entry {
char seq[11];
int count;
struct Entry* next;
} Entry;
unsigned int hashStr(const char* s) {
unsigned int h = 0;
for (int i = 0; i < 10; i++)
h = h * 31 + s[i];
return h % HASH_SIZE;
}
char** findRepeatedDnaSequences(char* s, int* returnSize) {
*returnSize = 0;
int len = strlen(s);
if (len < 10) return NULL;
Entry** table = (Entry**)calloc(HASH_SIZE, sizeof(Entry*));
char** res = (char**)malloc(sizeof(char*) * (len - 9));
for (int i = 0; i <= len - 10; i++) {
unsigned int h = hashStr(s + i);
Entry* e = table[h];
Entry* found = NULL;
while (e) {
if (strncmp(e->seq, s + i, 10) == 0) { found = e; break; }
e = e->next;
}
if (found) {
if (found->count == 1) {
res[*returnSize] = (char*)malloc(11);
strncpy(res[*returnSize], s + i, 10);
res[*returnSize][10] = '\0';
(*returnSize)++;
}
found->count++;
} else {
Entry* ne = (Entry*)malloc(sizeof(Entry));
strncpy(ne->seq, s + i, 10);
ne->seq[10] = '\0';
ne->count = 1;
ne->next = table[h];
table[h] = ne;
}
}
// 释放哈希表
for (int i = 0; i < HASH_SIZE; i++) {
Entry* e = table[i];
while (e) { Entry* next = e->next; free(e); e = next; }
}
free(table);
return res;
}1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
js
/**
* @param {string} s
* @return {string[]}
*/
var findRepeatedDnaSequences = function (s) {
const seen = new Set()
const res = new Set()
for (let i = 0; i <= s.length - 10; i++) {
const sub = s.slice(i, i + 10)
if (seen.has(sub)) res.add(sub)
else seen.add(sub)
}
return [...res]
}1
2
3
4
5
6
7
8
9
10
11
12
13
14
2
3
4
5
6
7
8
9
10
11
12
13
14
py
class Solution:
def findRepeatedDnaSequences(self, s: str) -> List[str]:
seen = set()
res = set()
for i in range(len(s) - 9):
sub = s[i:i + 10]
if sub in seen:
res.add(sub)
else:
seen.add(sub)
return list(res)1
2
3
4
5
6
7
8
9
10
11
2
3
4
5
6
7
8
9
10
11
- 时间复杂度:
,其中 是字符串长度 - 空间复杂度:
,哈希集合存储所有子串
算法思路:
- 用滑动窗口遍历所有长度为 10 的子串
- 用
seen集合记录已出现的子串,用res集合收集重复出现的子串